Load Packages

In [1]:
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso, Lars, Ridge, ElasticNet, LassoLars, LassoLarsCV, LinearRegression
import re
from umap import UMAP
import requests
import pandas as pd
from bs4 import BeautifulSoup
import seaborn as sns
import matplotlib.pyplot as plt
import gower
import pickle
from collections import Counter
import plotly.express as px
from xgboost import XGBRFRegressor
import shap
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler

# import the real estate price analytics library
from lib.real_estate_analytics_library import *
In [2]:
# optional - suppress warnings
import warnings
warnings.filterwarnings('ignore')

Scrape Property Price Data

In [3]:
# the root page link is used to generate the links for all pages
root = 'https://en.comparis.ch/immobilien/result/list?requestobject=%7B%22DealType%22%3A%2210%22%2C%22LocationSearchString%22%3A%22Zurich%22%2C%22RootPropertyTypes%22%3A%5B%220%22%5D%2C%22PriceTo%22%3A%22-10%22%2C%22RoomsFrom%22%3A%22-10%22%2C%22Sort%22%3A%2211%22%2C%22AdAgeMax%22%3A-1%2C%22ComparisPointsMin%22%3A-1%2C%22SiteId%22%3A-1%7D&sort=11&page='
In [4]:
# Open provided link using the requests package
# get the properties in Zürich, using the Comparis link for this result
links_page = requests.get('https://en.comparis.ch/immobilien/result/list?requestobject=%7B%22DealType%22%3A%2210%22%2C%22LocationSearchString%22%3A%22Zurich%22%2C%22RootPropertyTypes%22%3A%5B%220%22%5D%2C%22PriceTo%22%3A%22-10%22%2C%22RoomsFrom%22%3A%22-10%22%2C%22Sort%22%3A%2211%22%2C%22AdAgeMax%22%3A-1%2C%22ComparisPointsMin%22%3A-1%2C%22SiteId%22%3A-1%7D&sort=11')
In [5]:
soup = BeautifulSoup(links_page.content, 'html.parser')
In [6]:
# get the page number links
links = list(l['href'] for l in soup.find_all("a",{"class":"css-1yj1f35 excbu0j4"}))
In [7]:
# get the number of pages available for the location in question
num_pages = int(links[-2][links[-2].find('page=') + 5:]) + 1
In [8]:
# generate the list of pages that contain properties for the location in question
property_links = [root + str(i) for i in range(0, num_pages, 1)]
In [9]:
# define the root that we will comine with the property ID, giving us the page for each property
root = 'https://en.comparis.ch/immobilien/marktplatz/details/show/'
In [10]:
# define the list for storing the specific page for each property
pages = []

for property_link in property_links:    
    page = requests.get(property_link)

    soup = BeautifulSoup(page.content, 'html.parser')

    raw_id_list = re.findall(r'"AdId":[-+]?[0-9]+,', str(soup))

    id_list = [raw_id[raw_id.find(':') + 1:raw_id.find(',')] for raw_id in raw_id_list]

    # comine the root with the property ID, giving us the page for each property
    pages.extend([root + i for i in id_list])
In [11]:
# get the attributes for each property from the Comparis website
properties = []

for p in pages:
    page = requests.get(p)
    soup = BeautifulSoup(page.content, 'html.parser')
    property_address = list(soup.find("h3",{"class":"text-green"}))
    property_attributes = list(soup.find("dl",{"class":"row xsmall-up-2 medium-up-3 large-up-4 attributes-grid"}).stripped_strings)
    properties.append([property_address, property_attributes])
In [12]:
# check the length of the property attributes list
len(properties)
Out[12]:
1000
In [13]:
# define the list of attributes that will be gathered from the scraped data
property_type = []
gross_rent = []
net_rent = []
living_space = []
rooms = []
floor = []
available_date = []
public_transport = []
motorway = []
shop = []
In [14]:
# flatten the property address list
property_address = [record[0][0] for record in properties]
In [15]:
# cycle through the scraped property data and separate it into attribute-based lists that will be used to 
# create a pandas DataFrame

for record in properties:
    
    try:
        property_type.append(record[1][record[1].index('Property type') + 1])
    except:
        property_type.append(None)
    
    try:
        gross_rent.append(float(record[1][record[1].index('Rent per month') + 1][4:].replace(',','')))
    except:
        gross_rent.append(None)
    
    try:
        net_rent.append(float(record[1][record[1].index('Rent per month (without charges)') + 1][4:].replace(',','')))
    except:
        net_rent.append(None)
    
    try:
        living_space.append(float(record[1][record[1].index('Living space') + 1][:-3]))
    except:
        living_space.append(None)
    
    try:
        rooms.append(get_num_rooms(record[1][record[1].index('Rooms') + 1]))
    except:
        rooms.append(None)
    
    try:
        floor.append(record[1][record[1].index('Floor') + 1])
    except:
        floor.append(None)
    
    try:
        available_date.append(record[1][record[1].index('Available') + 1])
    except:
        available_date.append(None)
    
    try:
        public_transport.append(float(record[1][record[1].index('Public transport stop') + 1][:-2]))
    except:
        public_transport.append(None)
    
    try:
        motorway.append(float(record[1][record[1].index('Motorway') + 1][:-2]))
    except:
        motorway.append(None)
    
    try:
        shop.append(float(record[1][record[1].index('Shops') + 1][:-2]))
    except:
        shop.append(None)
In [16]:
# create a pandas DataFrame that contains the raw attributes that we have gathered
property_records = pd.DataFrame(list(zip(property_address, property_type, gross_rent, net_rent, living_space, rooms, floor, available_date, public_transport, motorway, shop)), columns =['property_address', 'property_type', 'gross_rent', 'net_rent', 'living_space', 'rooms', 'floor', 'available_date', 'public_transport', 'motorway', 'shop'])
In [17]:
# show DataFrame
property_records
Out[17]:
property_address property_type gross_rent net_rent living_space rooms floor available_date public_transport motorway shop
0 8003 Zürich ZH Commercial property 300.0 NaN NaN 1.0 None By arrangement NaN NaN NaN
1 8006 Zürich ZH Other 755.0 NaN NaN 1.0 None By arrangement NaN NaN NaN
2 8003 Zürich ZH Apartment 1390.0 NaN NaN 2.0 None By arrangement NaN NaN NaN
3 8038 Zürich ZH Apartment 1150.0 NaN NaN 1.5 None By arrangement NaN NaN NaN
4 8002 Zürich ZH Apartment 770.0 NaN NaN 1.5 None By arrangement NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ...
995 Kreis 7, am Römerhof, 8032 Zürich Commercial property 1850.0 1460.0 NaN 2.0 Ground floor 01/06/2021 50.0 NaN 50.0
996 Uraniastrasse 14, 8001 Zürich Commercial property NaN NaN NaN NaN 4. floor Immediately 50.0 2500.0 10.0
997 8001 Zürich Commercial property 18700.0 NaN NaN NaN Ground floor None NaN NaN NaN
998 Edenstrasse 5, 8045 Zürich Commercial property NaN NaN NaN NaN Ground floor Immediately NaN NaN NaN
999 8041 Zürich Apartment 2075.0 1650.0 82.0 4.0 2. floor None NaN NaN NaN

1000 rows × 11 columns

In [18]:
# save the scraped property records
property_records.to_csv('data/property_records_rent.csv')

Process Data

In this section, we process the scraped web data. This involves encoding all features as the appropriate data type and performing imputation (i.e. encoding missing data points as the mean, median or mode of the existing data.

In [19]:
# load data
property_records = pd.read_csv('data/property_records_rent.csv')
In [20]:
# display the ratio of missing values for the below features

print('gross_rent:', property_records.loc[property_records['gross_rent'].isna() == True].shape[0]/property_records.shape[0])
print('living_space:', property_records.loc[property_records['living_space'].isna() == True].shape[0]/property_records.shape[0])
print('rooms:', property_records.loc[property_records['rooms'].isna() == True].shape[0]/property_records.shape[0])
print('property_address:', property_records.loc[property_records['property_address'].isna() == True].shape[0]/property_records.shape[0])
print('floor:', property_records.loc[property_records['floor'].isna() == True].shape[0]/property_records.shape[0])
print('property_type:', property_records.loc[property_records['property_type'].isna() == True].shape[0]/property_records.shape[0])
print('shop:', property_records.loc[property_records['shop'].isna() == True].shape[0]/property_records.shape[0])
print('public_transport:', property_records.loc[property_records['public_transport'].isna() == True].shape[0]/property_records.shape[0])
print('motorway:', property_records.loc[property_records['motorway'].isna() == True].shape[0]/property_records.shape[0])
gross_rent: 0.127
living_space: 0.461
rooms: 0.287
property_address: 0.0
floor: 0.361
property_type: 0.0
shop: 0.757
public_transport: 0.737
motorway: 0.904
In [21]:
# process the data for use in a price prediction model, pricing analytics
property_records = process_records(property_records)
In [22]:
# save the processed property records
property_records.to_csv('data/processed_property_records_rent.csv')
In [23]:
# save the possible values for each feature
with open('data/possible_postcodes.pickle', 'wb') as handle:
    pickle.dump(list(property_records['property_postcode'].unique()), handle)

with open('data/possible_floors.pickle', 'wb') as handle:
    pickle.dump(list(property_records['floor'].unique()), handle)

with open('data/possible_types.pickle', 'wb') as handle:
    pickle.dump(list(property_records['property_type'].unique()), handle)

Model Selection and Training

In this section we will select, train and save two models - one tree-based model, and one linear regression-based model. The tree-based model will be selected because it has a lower mean absolute error, while the linear regression-based model will be used to extrapolate the price of real estate that falls outside of the range of the training data (i.e. very high-value real estate), since tree-based models cannot predict values that are higher than the highest target value in the dataset on which they are trained.

Note: the linear model assumes that there is a linear relationship between price and other features such as living space and number of rooms for larger properties outside of the dataset.

The methodology used in this Jupyter notebook assumes stability in the price data for the records that were scraped - that is, we assume that the prices did not significantly change over the time period covered by the property listings.

In [24]:
# load data
property_records = pd.read_csv('data/processed_property_records_rent.csv')
In [25]:
x = property_records[[col for col in property_records.columns if col not in ['gross_rent', 'net_rent', 'Unnamed: 0', 'property_address', 'available_date', 'property_type', 'floor', 'property_postcode']]]
y = property_records['gross_rent']
In [26]:
fig = px.scatter(property_records, x="rooms", y="gross_rent", color="property_type", title="Gross Rent vs Number of Rooms", hover_data=['property_postcode'])
fig.show()
In [27]:
fig = px.box(property_records, x="rooms", y="gross_rent", title="Gross Rent vs Number of Rooms", points=False)
fig.show()
In [28]:
fig = px.scatter(property_records, x="living_space", y="gross_rent", color="property_type", title="Gross Rent vs Living Space", hover_data=['property_postcode'])
fig.show()
In [29]:
fig = px.box(property_records, x="property_postcode", y="gross_rent", title="Gross Rent vs Living Space", points=False)
fig.update_xaxes(type='category')
fig.show()
In [30]:
fig = px.box(property_records, x="floor", y="gross_rent", title="Gross Rent vs Living Space", points=False)
fig.update_xaxes(type='category')
fig.show()
In [31]:
fig = px.box(property_records, x="property_type", y="gross_rent", title="Gross Rent vs Living Space", points=False)
fig.update_xaxes(type='category')
fig.show()
In [32]:
# scale the float features
columns = ['living_space', 'rooms', 'public_transport', 'motorway', 'shop']
scaler = StandardScaler().fit(x[columns])
scaled = scaler.transform(x[columns])
scaled = pd.DataFrame(scaled, columns=['scaled_' + column for column in columns])
x = pd.concat([x, scaled], axis=1)
In [33]:
# save the scaler model for later use
with open('data/scaler.pickle', 'wb') as handle:
    pickle.dump(scaler, handle)
In [34]:
x = x.drop(columns=['living_space', 'rooms', 'public_transport', 'motorway', 'shop'])
In [35]:
# calculate the correlation_matrix matrix of the features and the dependent variable
correlation_matrix = property_records[[col for col in property_records.columns if col not in ['net_rent', 'Unnamed: 0', 'property_address', 'available_date', 'property_type', 'floor', 'property_postcode']]].corr().loc[['gross_rent']].drop(['gross_rent'], axis=1)
In [36]:
# visualize the correlation_matrix matrix
fig, ax = plt.subplots(figsize=(15,10))
sns.heatmap(correlation_matrix, square=True, vmin=-1, vmax=1, ax=ax, linewidths=1, xticklabels=correlation_matrix.columns, cmap="Blues")
plt.yticks(rotation=0)
plt.show()
In [37]:
get_vifs(x)
Out[37]:
[('8000', inf), ('8001', inf), ('8002', inf), ('8003', inf), ('8004', inf), ('8005', inf), ('8006', inf), ('8008', inf), ('8032', inf), ('8037', inf), ('8038', inf), ('8041', inf), ('8044', inf), ('8045', inf), ('8046', inf), ('8047', inf), ('8048', inf), ('8049', inf), ('8050', inf), ('8051', inf), ('8052', inf), ('8053', inf), ('8055', inf), ('8057', inf), ('8058', inf), ('8064', inf), ('1. floor', inf), ('10. floor', inf), ('11. floor', inf), ('12. floor', inf), ('14. floor', inf), ('17. floor', inf), ('19. floor', inf), ('2. floor', inf), ('21. floor', inf), ('22. floor', inf), ('3. floor', inf), ('4. floor', inf), ('5. floor', inf), ('6. floor', inf), ('7. floor', inf), ('8. floor', inf), ('Basement', inf), ('Ground floor', inf), ('Apartment', inf), ('Apartment with terrace', inf), ('Attic apartment', inf), ('Commercial property', inf), ('Furnished apartment', inf), ('Hobby room', inf), ('Loft', inf), ('Maisonette', inf), ('Other', inf), ('Parking space', inf), ('Parking space, garage', inf), ('Penthouse', inf), ('Shared apartment', inf), ('Single garage', inf), ('Single room', inf), ('Single-family house', inf), ('Studio', inf), ('Terraced/row house', inf), ('Underground garage', inf), ('Villa', inf), ('scaled_living_space', 2.128016636212446), ('scaled_rooms', 2.1226055168994806), ('scaled_public_transport', 1.4736299045307317), ('scaled_motorway', 1.4636964859128578), ('scaled_shop', 1.312334495610808)]

The above VIFs indicate, as expected, serious multicolinearty in the data. This is because of the the one hot encoding of the categorical data. In order to fix this problem, we can eliminate a column from each of the categorical feature sets. We will select the columns below, based on their frequency in the data. This should not result in any significant loss in the performance of the model, as the removed values will still be indicated in the data (because all of the remaining columns/features will be 0 if the removed value is present). For example, if we remove the 'Apartment' encoding, then any record for an apartment will have all other property_type encoding set to 0 (e.g features such as 'Single garage' will all be equal to 0).

In [38]:
sorted(Counter(property_records['property_postcode']).items(), key=lambda v: v[1], reverse=True)
Out[38]:
[(8004, 66), (8008, 57), (8050, 54), (8048, 53), (8002, 51), (8003, 50), (8001, 50), (8052, 49), (8032, 49), (8006, 47), (8005, 47), (8049, 38), (8057, 37), (8038, 28), (8051, 28), (8045, 28), (8046, 26), (8037, 25), (8047, 24), (8055, 20), (8044, 15), (8053, 14), (8041, 11), (8064, 4), (8000, 1), (8058, 1)]
In [39]:
sorted(Counter(property_records['floor']).items(), key=lambda v: v[1], reverse=True)
Out[39]:
[('1. floor', 420), ('Ground floor', 108), ('2. floor', 105), ('3. floor', 91), ('4. floor', 52), ('Basement', 40), ('5. floor', 28), ('6. floor', 10), ('8. floor', 4), ('7. floor', 3), ('12. floor', 2), ('11. floor', 2), ('17. floor', 2), ('21. floor', 2), ('22. floor', 1), ('14. floor', 1), ('10. floor', 1), ('19. floor', 1)]
In [40]:
sorted(Counter(property_records['property_type']).items(), key=lambda v: v[1], reverse=True)
Out[40]:
[('Apartment', 401), ('Commercial property', 124), ('Other', 94), ('Underground garage', 58), ('Furnished apartment', 58), ('Parking space', 43), ('Single room', 18), ('Shared apartment', 17), ('Penthouse', 16), ('Attic apartment', 7), ('Maisonette', 7), ('Single garage', 6), ('Studio', 6), ('Parking space, garage', 6), ('Hobby room', 4), ('Loft', 3), ('Single-family house', 2), ('Terraced/row house', 1), ('Villa', 1), ('Apartment with terrace', 1)]
In [41]:
# define the columns that are to be eliminated from the input features to the Linear Regression model. This is to 
# eliminated multicolinearity.
eliminated_columns = ['8001', '1. floor', 'Apartment']
In [42]:
# The below VIFs for the reduced data indicate no multicolinearity.
get_vifs(x.drop(columns=eliminated_columns))
Out[42]:
[('scaled_living_space', 2.124658017944176), ('scaled_rooms', 2.1214900955945324), ('Underground garage', 1.8472720949801487), ('Basement', 1.7379379381356073), ('Other', 1.539125112595943), ('scaled_public_transport', 1.4725648219772833), ('scaled_motorway', 1.4636084755142376), ('8064', 1.4397339113777705), ('Terraced/row house', 1.3997940230341082), ('Ground floor', 1.3498323517453095), ('Commercial property', 1.3391933979345318), ('2. floor', 1.3225242758220375), ('8050', 1.3168955642205091), ('scaled_shop', 1.3119962231936757), ('8006', 1.27782624628705), ('3. floor', 1.2683204512009354), ('8004', 1.2654438394214258), ('8005', 1.2650069591686777), ('Furnished apartment', 1.2525427729188645), ('Maisonette', 1.243280175945684), ('Shared apartment', 1.2391390549515637), ('8008', 1.2087731149565042), ('19. floor', 1.2056436291037183), ('8032', 1.2013602865536188), ('8002', 1.1995614775102181), ('12. floor', 1.1975748062867477), ('8049', 1.19694651785077), ('Parking space', 1.1913768263757403), ('8052', 1.1742784420423265), ('4. floor', 1.1741105559153413), ('8041', 1.1733508628787617), ('5. floor', 1.1702144605857552), ('8003', 1.160217269760856), ('Single-family house', 1.160145996274261), ('8048', 1.1593890719466486), ('Single room', 1.1590547068656385), ('8053', 1.151298874692645), ('8046', 1.1286375971666824), ('8045', 1.127270292421176), ('8047', 1.1186073571078725), ('Penthouse', 1.1177877489380605), ('8057', 1.1111920248457974), ('6. floor', 1.1106855208478705), ('8051', 1.1095931983914649), ('Parking space, garage', 1.1000339550857383), ('8038', 1.0976255500176357), ('8044', 1.094818095294804), ('8055', 1.0943244048481735), ('Hobby room', 1.082829575097201), ('Apartment with terrace', 1.0824107662938434), ('8058', 1.0764260370942107), ('Villa', 1.0722144668506037), ('8037', 1.0712843928789852), ('8. floor', 1.0696308054514037), ('Single garage', 1.069008624477743), ('8000', 1.0685569845734144), ('21. floor', 1.067619704294715), ('Studio', 1.0674701863649867), ('Loft', 1.0651071698639503), ('Attic apartment', 1.0647749821894652), ('14. floor', 1.0564890196089818), ('11. floor', 1.0492747160926146), ('7. floor', 1.0444921910153726), ('10. floor', 1.039767526053991), ('22. floor', 1.0282022943397713), ('17. floor', 1.0252503319828892)]
In [43]:
# save the list of eliminated columns for later use
with open('data/eliminated_columns.pickle', 'wb') as handle:
    pickle.dump(eliminated_columns, handle)
In [44]:
# remove the outliers detected by Tukey's test - this reduced dataset will be used in the training of the linear 
# models
xe, ye = remove_outliers_tukeys_test(x.drop(columns=eliminated_columns), y)
In [45]:
# use the Gower distance to scale the data for inout into UMAP dimensionality-reduction, which takes into account
# the float inputs and their interaction with the one hot-encoded data
umap_results = UMAP(n_neighbors=20).fit_transform(gower.gower_matrix(pd.concat([y, x], axis=1)))
In [46]:
outlier_indices = get_outliers_isolation_forest(x, y, n_estimators=100, contamination=0.06)
normal_indices = [i for i in range(0, x.shape[0], 1) if i not in outlier_indices]
In [47]:
outliers = pd.DataFrame(zip([v[0] for v in umap_results[outlier_indices]], [v[1] for v in umap_results[outlier_indices]], ['Outlier' for i in range(0, len(outlier_indices), 1)]), columns=['Dimension 1', 'Dimension 2', 'Status'])
In [48]:
normal = pd.DataFrame(zip([v[0] for v in umap_results[normal_indices]], [v[1] for v in umap_results[normal_indices]], ['Normal' for i in range(0, len(outlier_indices), 1)]), columns=['Dimension 1', 'Dimension 2', 'Status'])
In [49]:
# save the UMAP results as a pandas DataFrame
umap_data = pd.concat([normal, outliers]).reset_index(drop=True)
In [50]:
# plot the UMAP results, showing the outliers vs normal data points, based on the isolation forest model
fig = px.scatter(umap_data, x="Dimension 1", y="Dimension 2", color="Status", title="UMAP Result", hover_data=[umap_data.index.values])
fig.show()
In [51]:
# remove the outliers detected by the isolation forest - this reduced dataset will be used in the training of the 
# tree-based models
xt, yt = remove_outliers_isolation_forest(x, y, n_estimators=100, contamination=0.06)
In [52]:
model_types = [['Lasso', Lasso()], ['Ridge', Ridge()], ['ElasticNet', ElasticNet()], ['LassoLars', LassoLars()], ['LassoLarsCV', LassoLarsCV()], ['Lars', Lars()], ['LinearRegression', LinearRegression()]]
In [53]:
model_results = train_model(xe, ye, model_types, 5)
In [54]:
# get the top 5 results, selected based on the mae metric
top_models = sorted(model_results, key=lambda v: v[4], reverse=False)[:5]
In [55]:
top_models
Out[55]:
[['LassoLars', LassoLars(alpha=0.03997004780106284, copy_X=True, eps=2.220446049250313e-16, fit_intercept=True, fit_path=True, max_iter=500, normalize=True, positive=False, precompute='auto', verbose=False), 629.2619219779707, 454.9744492947117, 329.87443140783836], ['LassoLarsCV', LassoLarsCV(copy_X=True, cv=None, eps=2.220446049250313e-16, fit_intercept=True, max_iter=500, max_n_alphas=1000, n_jobs=None, normalize=True, positive=False, precompute='auto', verbose=False), 630.0693713715135, 458.6292427729845, 344.136413819003], ['Lasso', Lasso(alpha=0.13886029729300645, copy_X=True, fit_intercept=True, max_iter=1000, normalize=False, positive=False, precompute=False, random_state=None, selection='cyclic', tol=0.0001, warm_start=False), 624.0475607696105, 457.183587877417, 344.69711379312633], ['Ridge', Ridge(alpha=0.2713500297435342, copy_X=True, fit_intercept=True, max_iter=None, normalize=False, random_state=None, solver='auto', tol=0.001), 616.4846051320543, 450.83499240817025, 346.42071656874515], ['ElasticNet', ElasticNet(alpha=0.001336210447858255, copy_X=True, fit_intercept=True, l1_ratio=0.5, max_iter=1000, normalize=False, positive=False, precompute=False, random_state=None, selection='cyclic', tol=0.0001, warm_start=False), 616.9423149642422, 449.36037690407863, 348.41663496195434]]
In [56]:
# train the best model on the expanded dataset
linear_pricing_model = model_results[0][1].fit(xe, ye)
linear_pricing_model_mae = top_models[0][4]
In [57]:
linear_pricing_model
Out[57]:
Lasso(alpha=0.13886029729300645, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)
In [58]:
# save the selected model
with open('models/linear_pricing_model.pickle', 'wb') as handle:
    pickle.dump(linear_pricing_model, handle)

# save the model's MAE
with open('models/linear_pricing_model_mae.pickle', 'wb') as handle:
    pickle.dump(linear_pricing_model_mae, handle)
In [59]:
# calculate feature importances based on the regression coefficients
regression_interpretation = pd.DataFrame(sorted(list(zip(xe.columns, linear_pricing_model.coef_)), key=lambda v: abs(v[1]), reverse=False), columns=['Feature', 'Weight'])
In [60]:
# plot the regression corefficient-based feature importances
fig = px.scatter(regression_interpretation, x="Weight", y="Feature")
fig.update_yaxes(type='category')
fig.show()
In [61]:
model_types = [['XGBRFRegressor', XGBRFRegressor()], ['AdaBoostRegressor', AdaBoostRegressor()], ['RandomForestRegressor', RandomForestRegressor()], ['ExtraTreesRegressor', ExtraTreesRegressor()], ['DecisionTreeRegressor', DecisionTreeRegressor()], ['GradientBoostingRegressor', GradientBoostingRegressor()]]
In [62]:
model_results = train_model(xt, yt, model_types, 3)
In [63]:
# get the top 5 results, selected based on the mae metric
top_models = sorted(model_results, key=lambda v: v[4], reverse=False)[:5]
In [64]:
top_models
Out[64]:
[['XGBRFRegressor', XGBRFRegressor(base_score=0.5, booster=None, colsample_bylevel=1, colsample_bynode=0.8, colsample_bytree=1, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints=None, learning_rate=1, max_delta_step=0, max_depth=17, min_child_weight=1, missing=nan, monotone_constraints=None, n_estimators=39, n_jobs=0, num_parallel_tree=39, objective='reg:squarederror', random_state=0, reg_alpha=0, reg_lambda=1e-05, scale_pos_weight=1, subsample=0.8, tree_method=None, validate_parameters=False, verbosity=None), 958.6053841889067, 510.1221498478994, 278.72304280598956], ['ExtraTreesRegressor', ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse', max_depth=16, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=21, n_jobs=None, oob_score=False, random_state=None, verbose=0, warm_start=False), 1138.6871871103747, 568.981291059084, 299.9066968907785], ['RandomForestRegressor', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_depth=20, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=28, n_jobs=None, oob_score=False, random_state=None, verbose=0, warm_start=False), 945.8140734084018, 521.9512340527875, 303.9041994579444], ['GradientBoostingRegressor', GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse', init=None, learning_rate=0.1, loss='ls', max_depth=8, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=31, n_iter_no_change=None, presort='deprecated', random_state=None, subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False), 1015.5976600882399, 548.6394157874423, 317.37715851752563], ['DecisionTreeRegressor', DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=11, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort='deprecated', random_state=None, splitter='best'), 1196.1542533499157, 638.1675762686, 337.398119122257]]
In [65]:
# train the best model on the expanded dataset
pricing_model = top_models[0][1].fit(xt, yt)
pricing_model_mae = top_models[0][4]
In [66]:
pricing_model
Out[66]:
XGBRFRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
               colsample_bynode=0.8, colsample_bytree=1, gamma=0, gpu_id=-1,
               importance_type='gain', interaction_constraints=None,
               learning_rate=1, max_delta_step=0, max_depth=17,
               min_child_weight=1, missing=nan, monotone_constraints=None,
               n_estimators=39, n_jobs=0, num_parallel_tree=39,
               objective='reg:squarederror', random_state=0, reg_alpha=0,
               reg_lambda=1e-05, scale_pos_weight=1, subsample=0.8,
               tree_method=None, validate_parameters=False, verbosity=None)
In [67]:
# save the selected model
with open('models/pricing_model.pickle', 'wb') as handle:
    pickle.dump(pricing_model, handle)

# save the model's MAE
with open('models/pricing_model_mae.pickle', 'wb') as handle:
    pickle.dump(pricing_model_mae, handle)
In [68]:
# calculate and show the raw SHAP values for the model
# reference: https://christophm.github.io/interpretable-ml-book/shap.html

# load JS visualization code to notebook
shap.initjs()

explainer = shap.TreeExplainer(pricing_model)
shap_values = explainer.shap_values(xt)

shap.summary_plot(shap_values, xt)
In [69]:
# show the SHAP value-based relative model feature importances
shap.summary_plot(shap_values, xt, plot_type="bar")

Predict the price of any given property

In [70]:
# show the possible values for each feature
with open('data/possible_postcodes.pickle', 'rb') as handle:
    print('Possible Postcodes =', pickle.load(handle))

print('')

with open('data/possible_floors.pickle', 'rb') as handle:
    print('Possible Floors =', pickle.load(handle))

print('')

with open('data/possible_types.pickle', 'rb') as handle:
    print('Possible Property Types =', pickle.load(handle))
Possible Postcodes = ['8003', '8006', '8038', '8002', '8008', '8048', '8049', '8047', '8005', '8004', '8001', '8055', '8052', '8032', '8050', '8051', '8037', '8041', '8045', '8044', '8053', '8057', '8064', '8000', '8046', '8058']

Possible Floors = ['1. floor', '4. floor', '6. floor', '2. floor', '5. floor', 'Ground floor', '3. floor', 'Basement', '7. floor', '12. floor', '8. floor', '22. floor', '14. floor', '10. floor', '11. floor', '19. floor', '17. floor', '21. floor']

Possible Property Types = ['Commercial property', 'Other', 'Apartment', 'Attic apartment', 'Underground garage', 'Penthouse', 'Furnished apartment', 'Parking space', 'Shared apartment', 'Single room', 'Single garage', 'Studio', 'Parking space, garage', 'Maisonette', 'Single-family house', 'Terraced/row house', 'Loft', 'Hobby room', 'Villa', 'Apartment with terrace']
In [71]:
# load data
property_records = pd.read_csv('data/processed_property_records_rent.csv')
In [72]:
# load the pre-trained models and other required data from pickle files
with open('models/pricing_model.pickle', 'rb') as handle:
    pricing_model = pickle.load(handle)

with open('models/pricing_model_mae.pickle', 'rb') as handle:
    pricing_model_mae = pickle.load(handle)

with open('models/linear_pricing_model.pickle', 'rb') as handle:
    linear_pricing_model = pickle.load(handle)

with open('models/linear_pricing_model_mae.pickle', 'rb') as handle:
    linear_pricing_model_mae = pickle.load(handle)
    
with open('data/eliminated_columns.pickle', 'rb') as handle:
    eliminated_columns = pickle.load(handle)

with open('data/scaler.pickle', 'rb') as handle:
    scaler = pickle.load(handle)
    
with open('data/encoder.pickle', 'rb') as handle:
    encoder = pickle.load(handle)
In [73]:
# define the feature values for the property
living_space = 80
rooms = 2.5
postcode = '8006'
floor = 'Ground floor'
property_type = 'Apartment'
public_transport = 100
motorway = 100
shop = 100
In [74]:
input_values = encode_input(living_space, rooms, postcode, floor, property_type, public_transport, motorway, shop, scaler, encoder)
In [75]:
input_values
Out[75]:
living_space rooms public_transport motorway shop property_postcode floor property_type 8000 8001 ... Single-family house Studio Terraced/row house Underground garage Villa scaled_living_space scaled_rooms scaled_public_transport scaled_motorway scaled_shop
0 80 2.5 100 100 100 8006 Ground floor Apartment 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.307777 0.011709 -0.699126 -4.496354 -1.341845

1 rows × 77 columns

In [76]:
# use one of: [regression_model, tree_model]
model_type = 'regression_model'
In [77]:
# calculate price
if model_type == 'regression_model':
    price = linear_pricing_model.predict(input_values.drop(columns=['living_space', 'rooms', 'public_transport', 'motorway', 'shop', 'property_postcode', 'floor', 'property_type'] + eliminated_columns))[0]
    mae = linear_pricing_model_mae
else:
    price = pricing_model.predict(input_values.drop(columns=['living_space', 'rooms', 'public_transport', 'motorway', 'shop', 'property_postcode', 'floor', 'property_type']))[0]
    mae = pricing_model_mae

calculated_price = pd.concat([pd.DataFrame([price], columns=['gross_rent']), input_values], axis=1)
In [78]:
calculated_price
Out[78]:
gross_rent living_space rooms public_transport motorway shop property_postcode floor property_type 8000 ... Single-family house Studio Terraced/row house Underground garage Villa scaled_living_space scaled_rooms scaled_public_transport scaled_motorway scaled_shop
0 2096.008855 80 2.5 100 100 100 8006 Ground floor Apartment 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.307777 0.011709 -0.699126 -4.496354 -1.341845

1 rows × 78 columns

In [79]:
print('Predicted Price =', price, '+/-', mae, 'CHF')
print('Price Range =', price - mae, 'to', price + mae, 'CHF')
Predicted Price = 2096.008855492157 +/- 329.87443140783836 CHF
Price Range = 1766.1344240843189 to 2425.8832868999957 CHF
In [80]:
# the predicted price of the property is shown as a red cross, and is plotted alongside properties that are in 
# it's peer group (i.e. properties that have the same number of rooms and the same property type)
fig = px.scatter(property_records[(property_records['rooms'] == rooms) & (property_records['property_type'] == property_type)], x="living_space", y="gross_rent", color="property_type", hover_data=['living_space'])
fig1 = px.scatter(calculated_price, x="living_space", y="gross_rent", title="Calculated Price vs Peer Group", hover_data=['property_postcode'])
fig1.update_traces(marker=dict(size=10, color='Red', symbol='x'))
fig.add_trace(fig1.data[0])
fig.show()
In [ ]: